########################################################
# Project:    Commission Communication
# Task:       Extract language-based indicators from PRs
# Author:     Christian Rauh (12.01.2021)
########################################################

# Packages ####
library(tidyverse) # 1.3.0
library(quanteda) # 3.2.0
library(sophistication) # 0.70 https://github.com/kbenoit/sophistication
library(spacyr) # 1.2.1
library(textcat) # 1.0-7

# Edited functions from the sophistication package
# that exclude stopwords when calculating term familiarity
source("./Tools/covars_make_baselines_CR.R")



# Commission press releases ####
# Reads the cleaned Commission press release corpus, computes language
# indicators per document (reading ease, term familiarity, part-of-speech
# counts), merges them back onto the corpus and exports the result
# without the text columns.
# Runtime note from the author: ~ 1h on Dell

start <- Sys.time()

# Clean version of the Comm IP corpus
corpus <- read_rds("./Corpora/EC-PressReleases_1985-2020_clean.RDS")

# Sample for testing
# corpus <- corpus %>% sample_n(100, replace = FALSE)

# Identifier variable: running row number, used below as the merge key.
# seq_len() instead of 1:nrow() so that a zero-row corpus yields an empty
# id vector rather than c(1, 0).
corpus$cid <- seq_len(nrow(corpus))

# Quanteda corpus object, carrying cid and year as document variables
qcorp <- corpus(corpus$text, docvars = data.frame(corpus[, c("cid", "year")]))

# Lookup table linking quanteda's document ids to our own cid
docids <- docvars(qcorp) %>%
  mutate(doc_id = as.character(docid(qcorp)))


# Reading ease scores, based on sophistication package
re <- covars_make(qcorp, readability_measure = "Flesch")
# Rebuild a doc_id from the row names so it can be matched against docids.
# NOTE(review): this assumes the row names are bare document numbers and
# that the corpus uses ids of the form "text<number>" - confirm.
re$doc_id <- paste0("text", rownames(re))

# Google N-Gram familiarity measures
# Based on modified function from sophistication package;
# each document is compared against the baseline of its own year
fam <- covars_make_baselines_CR(qcorp, baseline_year = docvars(qcorp, "year"))
# NOTE(review): here the row names are used as doc_id without a prefix,
# i.e. presumably they already are the document ids - confirm.
fam$doc_id <- rownames(fam)

# Part-of-speech distributions (sophistication/spacyr)
# Puts out doc_id itself, order not necessarily correct
pos <- covars_make_pos(qcorp)

# Combine data
# All merges are left joins (all.x = TRUE) on doc_id, so every document in
# docids is kept even if an indicator is missing for it.

indicators <- merge(docids[, c("doc_id", "cid")],
                    re[, c("doc_id", "meanSentenceLength", "Flesch")],
                    by = "doc_id", all.x = TRUE) %>%
  rename(flesch = Flesch)

indicators <- merge(indicators,
                    fam[, c("doc_id", "google_mean_local")],
                    by = "doc_id", all.x = TRUE) %>%
  rename(familiarity = google_mean_local)

indicators <- merge(indicators,
                    pos[, c("doc_id", "n_namedentities", "n_noun",
                            "n_verb", "n_sentence", "ntoken")],
                    by = "doc_id", all.x = TRUE)

# Attach the indicators to the original data via cid, add the noun/verb
# ratio (Inf or NaN where n_verb is 0) and drop the helper id and text
# columns before export.
corpus <- merge(corpus, indicators, by = "cid", all.x = TRUE) %>%
  mutate(nominal = n_noun / n_verb) %>%
  select(-c(doc_id, title, lead, body, text))


# Export
write_rds(corpus, "./Data/PR-Comm_Language.Rds")

# Report elapsed time for this section
end <- Sys.time()
duration1 <- end - start
duration1





# UK government press releases ####
# Reads the UK press release corpus, removes speeches and faulty scrapes,
# computes language indicators per document (reading ease, term
# familiarity, part-of-speech counts), merges them back onto the corpus
# and exports the result without the text columns.

start <- Sys.time()

# Clean version of the UK PR corpus
corpus <- read_rds("./Corpora/UK-GovPressReleases.Rds") %>%
  filter(speech == FALSE,
         str_count(text, "\\. ") > 2, # faulty scrapes do not contain multiple sentences
         str_detect(text, "(T|t)he "), # must contain "The " or "the " at least once
         !str_detect(text, "PDF, [0-9]{1,4}KB"), # Mostly reference to files only
         !str_detect(text, "HTML Details"), # Same
         !str_detect(text, "Kick off time")) # Lists of football travel advice

# Sample for testing
# corpus <- corpus %>% sample_n(100, replace = FALSE)

# Harmonize vars
# Prepend the headline as the first sentence and cut everything from
# " Curriculum Vitae " onwards
corpus$text <- paste0(corpus$headline, ". ", corpus$text) %>%
  str_remove(" Curriculum Vitae .*?$")
# Year = first four-digit sequence found in the date
corpus$year <- as.numeric(str_extract(as.character(corpus$date), "[0-9]{4}"))

# Identifier variable: running row number, used below as the merge key.
# seq_len() instead of 1:nrow() so that a corpus left empty by the filter
# above yields an empty id vector rather than c(1, 0).
corpus$cid <- seq_len(nrow(corpus))

# Quanteda corpus object, carrying cid and year as document variables
qcorp <- corpus(corpus$text, docvars = data.frame(corpus[, c("cid", "year")]))

# Lookup table linking quanteda's document ids to our own cid
docids <- docvars(qcorp) %>%
  mutate(doc_id = as.character(docid(qcorp)))


# Reading ease scores, based on sophistication package
re <- covars_make(qcorp, readability_measure = "Flesch")
# Rebuild a doc_id from the row names so it can be matched against docids.
# NOTE(review): this assumes the row names are bare document numbers and
# that the corpus uses ids of the form "text<number>" - confirm.
re$doc_id <- paste0("text", rownames(re))

# Google N-Gram familiarity measures
# Based on modified function from sophistication package;
# each document is compared against the baseline of its own year
fam <- covars_make_baselines_CR(qcorp, baseline_year = docvars(qcorp, "year"))
# NOTE(review): here the row names are used as doc_id without a prefix,
# i.e. presumably they already are the document ids - confirm.
fam$doc_id <- rownames(fam)

# Part-of-speech distributions (sophistication/spacyr)
# Puts out doc_id itself, order not necessarily correct
pos <- covars_make_pos(qcorp)

# Combine data
# All merges are left joins (all.x = TRUE) on doc_id, so every document in
# docids is kept even if an indicator is missing for it.

indicators <- merge(docids[, c("doc_id", "cid")],
                    re[, c("doc_id", "meanSentenceLength", "Flesch")],
                    by = "doc_id", all.x = TRUE) %>%
  rename(flesch = Flesch)

indicators <- merge(indicators,
                    fam[, c("doc_id", "google_mean_local")],
                    by = "doc_id", all.x = TRUE) %>%
  rename(familiarity = google_mean_local)

indicators <- merge(indicators,
                    pos[, c("doc_id", "n_namedentities", "n_noun",
                            "n_verb", "n_sentence", "ntoken")],
                    by = "doc_id", all.x = TRUE)

# Attach the indicators to the original data via cid, add the noun/verb
# ratio (Inf or NaN where n_verb is 0) and drop the helper id, metadata
# and text columns before export.
corpus <- merge(corpus, indicators, by = "cid", all.x = TRUE) %>%
  mutate(nominal = n_noun / n_verb) %>%
  select(-c(doc_id, headline, author, url, speech, text))


# Export
write_rds(corpus, "./Data/PR-UK_Language.Rds")

# Report elapsed time for this section
end <- Sys.time()
duration2 <- end - start
duration2




# IRE government press releases ####
# Reads the Irish press release corpus, strips share-widget residue, keeps
# only sentences detected as English, computes language indicators per
# document (reading ease, term familiarity, part-of-speech counts), merges
# them back onto the corpus and exports the result.

start <- Sys.time()

# Clean version of the IRE PR corpus
corpus <- read_rds("./Corpora/IRE-GovPressReleases.Rds")

# Sample for testing
# corpus <- corpus %>% sample_n(100, replace = FALSE)

# Harmonize vars
# Prepend the headline as the first sentence, remove the first occurrence
# of the share-button string and all bullet characters
corpus$text <- paste0(corpus$headline, ". ", corpus$text) %>%
  str_remove(fixed("Share . Email . Facebook . Twitter")) %>%
  str_remove_all(fixed("•"))
# Year = first four-digit sequence found in the date
corpus$year <- as.numeric(str_extract(as.character(corpus$date), "[0-9]{4}"))

# Identifier variable: running row number, used below as the merge key.
# seq_len() instead of 1:nrow() so that a zero-row corpus yields an empty
# id vector rather than c(1, 0).
n_docs <- nrow(corpus)
corpus$cid <- seq_len(n_docs)

# Filter non-english sentences
# Irish politicians often use Irish bits which would offset the grammar and lang indicators

# Preallocate as character so the column type does not depend on the loop
corpus$text2 <- NA_character_

# seq_len() makes the loop a no-op for an empty corpus
# (1:nrow() would iterate over 1 and 0)
for (i in seq_len(n_docs)) {

  # Progress (percent of documents processed)
  print(round((i / n_docs) * 100, 2))

  # Sentence tokenizer: one row per sentence of document i
  df <- spacy_tokenize(corpus$text[i], what = "sentence") %>%
    data.frame() %>%
    rename(sentences = 1)

  # Language detection per sentence
  df$lang <- textcat(df$sentences)

  # Drop non-english sentences
  # (filter() also drops sentences whose detected language is NA)
  df <- df %>% filter(lang == "english")

  # Rebuild and store text; gives "" if no English sentence is left
  corpus$text2[i] <- paste(df$sentences, collapse = " ")
}

# Number of documents left without any English sentence
sum(corpus$text2 == "")


# Quanteda corpus object built from the English-only text,
# carrying cid and year as document variables
qcorp <- corpus(corpus$text2, docvars = data.frame(corpus[, c("cid", "year")]))

# Lookup table linking quanteda's document ids to our own cid
docids <- docvars(qcorp) %>%
  mutate(doc_id = as.character(docid(qcorp)))

# Reading ease scores, based on sophistication package
re <- covars_make(qcorp, readability_measure = "Flesch")
# Rebuild a doc_id from the row names so it can be matched against docids.
# NOTE(review): this assumes the row names are bare document numbers and
# that the corpus uses ids of the form "text<number>" - confirm.
re$doc_id <- paste0("text", rownames(re))

# Google N-Gram familiarity measures
# Based on modified function from sophistication package;
# each document is compared against the baseline of its own year
fam <- covars_make_baselines_CR(qcorp, baseline_year = docvars(qcorp, "year"))
# NOTE(review): here the row names are used as doc_id without a prefix,
# i.e. presumably they already are the document ids - confirm.
fam$doc_id <- rownames(fam)

# Part-of-speech distributions (sophistication/spacyr)
# Puts out doc_id itself, order not necessarily correct
pos <- covars_make_pos(qcorp)

# Combine data
# All merges are left joins (all.x = TRUE) on doc_id, so every document in
# docids is kept even if an indicator is missing for it.

indicators <- merge(docids[, c("doc_id", "cid")],
                    re[, c("doc_id", "meanSentenceLength", "Flesch")],
                    by = "doc_id", all.x = TRUE) %>%
  rename(flesch = Flesch)

indicators <- merge(indicators,
                    fam[, c("doc_id", "google_mean_local")],
                    by = "doc_id", all.x = TRUE) %>%
  rename(familiarity = google_mean_local)

indicators <- merge(indicators,
                    pos[, c("doc_id", "n_namedentities", "n_noun",
                            "n_verb", "n_sentence", "ntoken")],
                    by = "doc_id", all.x = TRUE)

# Attach the indicators to the original data via cid, add the noun/verb
# ratio (Inf or NaN where n_verb is 0) and drop the helper id, metadata
# and raw text columns before export.
# NOTE(review): text2 (the English-only text) is not dropped and therefore
# stays in the exported file, whereas the Commission and UK sections drop
# their text columns - confirm this is intended.
corpus <- merge(corpus, indicators, by = "cid", all.x = TRUE) %>%
  mutate(nominal = n_noun / n_verb) %>%
  select(-c(doc_id, headline, author, url, text))


# Export
write_rds(corpus, "./Data/PR-IRE_Language.Rds")

# Report elapsed time for this section
end <- Sys.time()
duration3 <- end - start
duration3